1.) Read in the data
# Globally silence warnings for the knitted document (mainly the tidyverse
# coercion/separate warnings below).
# NOTE(review): the default warn level is saved in defaultW but never restored
# via options(warn = defaultW) later in the file -- TODO confirm intended.
defaultW <- getOption("warn")
options(warn = -1)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Reading in the data
# NOTE(review): absolute, machine-specific path -- consider a relative path
# (file.path() / here::here()) so the script is portable.
retail <- read.csv("/Users/lawrence/Google Drive/DS/Time Series/OnlineRetailUCI2-master/online_retail_II.csv")
head(retail)
## Invoice StockCode Description Quantity
## 1 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12
## 2 489434 79323P PINK CHERRY LIGHTS 12
## 3 489434 79323W WHITE CHERRY LIGHTS 12
## 4 489434 22041 RECORD FRAME 7" SINGLE SIZE 48
## 5 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24
## 6 489434 22064 PINK DOUGHNUT TRINKET POT 24
## InvoiceDate Price Customer.ID Country
## 1 2009-12-01 07:45:00 6.95 13085 United Kingdom
## 2 2009-12-01 07:45:00 6.75 13085 United Kingdom
## 3 2009-12-01 07:45:00 6.75 13085 United Kingdom
## 4 2009-12-01 07:45:00 2.10 13085 United Kingdom
## 5 2009-12-01 07:45:00 1.25 13085 United Kingdom
## 6 2009-12-01 07:45:00 1.65 13085 United Kingdom
2.) Prepare and comment on the following exploratory plots: Total daily, weekly and monthly sales volumes
# Preparing the data: split a Date (YYYY-MM-DD) and a MonthYear (YYYY-MM)
# column out of InvoiceDate, and compute a per-line Sales amount.
retail <- retail %>%
  separate(InvoiceDate, into = c('Date'), sep = " ", remove = FALSE) %>%
  separate(InvoiceDate, into = c(7), remove = FALSE) %>%
  # Reference the piped columns directly; retail$Quantity * retail$Price
  # silently breaks if an earlier step ever changes the row set.
  mutate(Sales = Quantity * Price)
# Monthly totals (with a Year column split off the YYYY-MM key) and daily totals
monthly_yearly <- aggregate(Sales ~ MonthYear, data = retail, FUN = sum) %>%
  separate(MonthYear, into = c('Year'), sep = c(4), remove = FALSE)
daily <- aggregate(Sales ~ Date, data = retail, FUN = sum)
# Quick summary statistics of daily sales volume
val <- c(mean(daily$Sales), median(daily$Sales), max(daily$Sales), min(daily$Sales))
title <- c('Mean', 'Median', 'Max', 'Min')
rbind(title, val)
## [,1] [,2] [,3] [,4]
## title "Mean" "Median" "Max" "Min"
## val "31932.5340529801" "27481.625" "117271.12" "-22212.609"
colnames(daily) <- c("Date", "SalesVolume")
# Line chart of total sales volume per day
ggplot(daily, aes(Date, SalesVolume, group = 1)) +
  geom_line() +
  ggtitle("Daily total sales volume")
Daily sales volume appears highly volatile: the mean is about 32,000 pounds per day and the median about 27,000, but a few outliers reach as high as 117,000 pounds per day (perhaps a promotion?) and as low as -22,000 pounds, possibly just after Christmas when customers return their presents.
#Sales volume by month and year
# Map Year inside aes() so ggplot builds a discrete fill scale; passing
# monthly_yearly$Year as the fill *parameter* asks ggplot to treat the year
# strings as literal colour names.
ggplot(monthly_yearly, aes(x = MonthYear, y = Sales, fill = Year)) +
  geom_col() +
  ggtitle("Monthly/ Yearly sales volume")
There appears to be a clear monthly pattern with increasing sales towards the end of the year (and the Christmas season).
Last months’ revenue share by product and by customer
# Restrict to the final month of data (December 2011)
lastMonth <- retail[retail$MonthYear == '2011-12', ]
#Remove postage, fees and other sales unrelated expenses
lastMonth <- lastMonth[!(lastMonth$StockCode %in% c("DOT", "CRUK", "POST", "C2", "AMAZONFEE")), ]
# Revenue per customer, largest first; everyone outside the top 10 is lumped
# into an "Others" bucket
by_cust <- lastMonth %>%
  group_by(Customer.ID) %>%
  summarise(TotalSales = sum(Sales)) %>%
  arrange(desc(TotalSales)) %>%
  mutate(Index = row_number(), Customer.ID = ifelse(Index > 10, "Others", Customer.ID))
# Bar chart of the 10 largest customers (the "Others" bucket and unknown
# customer ids are excluded)
ggplot(
  subset(by_cust, Customer.ID != "Others" & !is.na(Customer.ID)),
  aes(x = reorder(Customer.ID, -TotalSales), y = TotalSales)
) +
  geom_bar(stat = "identity", aes(fill = Customer.ID)) +
  ggtitle("Last months’ revenue share by customer (10 largest customers)")
# Revenue share across all customers, largest first
ggplot(by_cust, aes(reorder(Customer.ID, -TotalSales), TotalSales)) +
  geom_bar(aes(fill = Customer.ID), stat = "identity") +
  ggtitle("Last months’ revenue share by customer (all customers)")
Sales appear to be highly fragmented, there is no one big customer.
# Revenue per product last month; keep the 10 best sellers by stock code and
# lump everything else into "Others"
by_prod <- lastMonth %>%
  group_by(StockCode) %>%
  summarise(TotalSales = sum(Sales)) %>%
  arrange(desc(TotalSales)) %>%
  mutate(
    Index = row_number(),
    StockCode = ifelse(Index > 10, "Others", StockCode)
  )
# Bar chart of the 10 products with the highest revenue share
ggplot(
  subset(by_prod, StockCode != "Others"),
  aes(x = reorder(StockCode, -TotalSales), y = TotalSales)
) +
  geom_bar(aes(fill = StockCode), stat = "identity") +
  ggtitle("Last months’ revenue share by product for 10 best selling products")
# Revenue share across all products
ggplot(by_prod, aes(x = reorder(StockCode, -TotalSales), y = TotalSales)) +
  geom_bar(aes(fill = StockCode), stat = "identity") +
  ggtitle("Last months’ revenue share by product")
# Total sales this month
sum(by_prod$TotalSales)
## [1] 441393.9
Again, sales appear to be highly fragmented by product type, each having monthly sales between 100-10000 pounds, out of total sales of 441393.9 pounds.
Weighted average monthly sale price by volume
# Volume-weighted average sale price per month
VWAP <- retail %>%
  group_by(MonthYear) %>%
  summarise(vwap = sum(Sales) / sum(Quantity))
ggplot(VWAP, aes(MonthYear, vwap, group = 1)) +
  geom_line() +
  ggtitle("Weighted average monthly sale price by volume")
There is no obvious trend in the weighted average monthly sale price by volume, it fluctuates between 1.60 to 2.20 pounds.
retail <-read.csv("/Users/lawrence/Google Drive/DS/Time Series/OnlineRetailUCI2-master/retail.csv")
A logical way to filter these returns out would be to match them with the initial sales order. This cannot be determined directly from the invoice, so we would need to match by customer id, stock code, absolute sales value and roughly the same time period.
a)The owner of the online retailer wants to know how much revenue to expect for this month (12/2011), to help him decide what sports car he buys his partner for Christmas.
Outline a few different solutions you might suggest that solve this problem. Include in your description:
What metrics/values you might want to use:
I would suggest using combined total sales volume, be it monthly, weekly or daily, as the response variable. This is far better, easier and more accurate to model (due to CLT and large numbers) than individual purchases by specific customers or products. Doing so would result in a far more complicated model that will not necessarily result in more accurate predictions, especially considering the highly fragmented nature of sales by individual customers or products. I will therefore only look at aggregate sales volume as a response variable, and use the Date as an explanatory variable to capture the underlying trend, and Month to capture recurring monthly sales patterns.
How you would aggregate those metrics:
The aggregation could be done using the aggregate function, or summarize using tidyverse.
What model/algorithm or logic you would use to make a prediction on them:
I would use the linear model lm() function in r with totalsales (monthly, daily or weekly) as the response and Date and as.factor(Month) as the explanatory variables.
What uncertainties you might need to explore: I would then look at confidence intervals, R^2, residuals and other goodness-of-fit measures to determine whether this is a good fit. If not, another option would be to use the time series functionality in R via the ts() or HoltWinters() functions, to use a glm() in case a Poisson or another distribution is better suited, or to take the log of the response variable, which often improves fit for sales data.
Looking at the monthly patterns in total sales volume observed in the two previous years, December was the month with the second highest sales in 2010, unless there is a strong reason to suggest otherwise I would expect a similar pattern in 2011. Additionally, overall sales growth will need to be taken into account to come up with an accurate estimation for this years December sales. We also have data for this year’s December of up to the 9th of December. This could be useful for estimation by comparing it with how the last two years full-December sales data compared with sales up until the 9th of December.
In addition, something like a profit margin, COGS and tax estimation would be helpful to determine if the owner can afford the Ferrari or not as just simple sales revenue does not say too much about the owners’ bottom line.
For starters, I think the lm() is the best approach to start as it is the most simple and easy to implement one, while likely still yielding reasonably good accuracy.
# Monthly totals with the calendar Month extracted; Year is discarded so each
# month pools both years' observations.
aggregate(Sales ~ MonthYear, data=retail, FUN=sum) %>% separate(MonthYear,into=c('Year','Month')) %>% select(-c(Year)) -> sales
# NOTE(review): row 25 is hard-coded as the incomplete December 2011 month --
# confirm if the underlying data changes.
sales <- sales[-25,] #Remove last incomplete December observation
# Mean sales by calendar month (Month as a factor-style regressor)
monthly.lm <- lm(Sales ~ Month, data=sales)
summary(monthly.lm)
##
## Call:
## lm(formula = Sales ~ Month, data = sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157235 -34335 0 34335 157235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 592017 60205 9.833 4.29e-07 ***
## Month02 -76440 85143 -0.898 0.386963
## Month03 132541 85143 1.557 0.145513
## Month04 -50123 85143 -0.589 0.566987
## Month05 77312 85143 0.908 0.381742
## Month06 93438 85143 1.097 0.293989
## Month07 36252 85143 0.426 0.677810
## Month08 77712 85143 0.913 0.379362
## Month09 344652 85143 4.048 0.001616 **
## Month10 465920 85143 5.472 0.000142 ***
## Month11 850189 85143 9.985 3.64e-07 ***
## Month12 377194 85143 4.430 0.000821 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 85140 on 12 degrees of freedom
## Multiple R-squared: 0.9483, Adjusted R-squared: 0.9008
## F-statistic: 19.99 on 11 and 12 DF, p-value: 4.945e-06
# Predicted December sales with a 95% interval from the monthly model.
# NOTE(review): se.fit is the standard error of the fitted *mean*, so this is a
# confidence interval for average December sales, not a prediction interval for
# a new December; 1.96 also assumes normality (t with 12 df would be ~2.18).
monthly.pred <- predict(monthly.lm, newdata = data.frame(Month='12'), se.fit = T)
lower<-monthly.pred$fit-1.96*monthly.pred$se.fit
upper<-monthly.pred$fit+1.96*monthly.pred$se.fit
cbind(monthly.pred$fit,lower,upper)
## lower upper
## 1 969210.4 851208 1087213
Just looking at historical monthly averages, we’d expect December sales to lie between 851,208 and 1,087,213 pounds with 95% confidence.
# Build a per-day sales table with the calendar month attached, then fit a
# linear model with a Date trend plus a Month seasonal effect.
daily.sales <- retail %>%
select(Sales, MonthYear, Date) %>%
separate(MonthYear,into=c('Year','Month')) %>%
select(-c(Year)) %>%
group_by(Date) %>%
summarise(DailySales=sum(Sales), Month) #Creates Month variable and aggregates daily sales
## `summarise()` has grouped output by 'Date'. You can override using the `.groups` argument.
# NOTE(review): passing the bare `Month` through summarise() relies on the
# deprecated multiple-rows-per-group behaviour; unique() below collapses the
# duplicates. A cleaner form would be Month = first(Month).
daily.sales <- unique(daily.sales) #Remove redundant rows
daily.sales$Date<- as.Date(daily.sales$Date) #Convert date to Date object
# NOTE(review): row 604 is hard-coded as the final, incomplete day -- confirm
# if the underlying data changes.
daily.sales<-daily.sales[-604,] #Exclude the last day as sales don't appear to be complete for that day
daily.lm<- lm(DailySales ~ Date + Month, data=daily.sales)
summary(daily.lm)
##
## Call:
## lm(formula = DailySales ~ Date + Month, data = daily.sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45823 -7534 -920 5929 72075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -69356.212 48194.239 -1.439 0.15065
## Date 6.349 3.251 1.953 0.05130 .
## Month02 -3362.758 3088.992 -1.089 0.27676
## Month03 1801.455 3006.525 0.599 0.54928
## Month04 -543.902 3167.746 -0.172 0.86373
## Month05 1874.935 3097.529 0.605 0.54521
## Month06 748.186 3066.365 0.244 0.80732
## Month07 -1643.851 3083.512 -0.533 0.59416
## Month08 -242.390 3103.767 -0.078 0.93778
## Month09 9825.167 3127.820 3.141 0.00177 **
## Month10 14296.506 3154.047 4.533 7.05e-06 ***
## Month11 28883.529 3183.082 9.074 < 2e-16 ***
## Month12 23836.384 3090.205 7.714 5.24e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15130 on 590 degrees of freedom
## Multiple R-squared: 0.3331, Adjusted R-squared: 0.3195
## F-statistic: 24.55 on 12 and 590 DF, p-value: < 2.2e-16
Looking at this model, we can see that there appears to be a (weak) sales growth trend: on average, sales tend to increase by about 6 pounds with every calendar day. This trend is only borderline significant, however (p-value = 0.05). Using this model to predict the remaining sales in December:
# Calendar dates for the remaining days of December 2011 (the 9th to the 31st);
# generated rather than hand-typed to avoid transcription errors.
date <- as.character(seq.Date(as.Date('2011-12-09'), as.Date('2011-12-31'), by = "day"))
month <- rep('12', length(date))
dec <- data.frame(Date = as.Date(date), Month = month)
# Daily-model predictions with standard errors for those days
daily.pred <- predict(daily.lm, newdata = dec, se.fit = TRUE)
lower <- daily.pred$fit - 1.96 * daily.pred$se.fit
upper <- daily.pred$fit + 1.96 * daily.pred$se.fit
# Sales already booked this December (2011-12-01 to 2011-12-08); the 9th is
# excluded as that day's data does not appear to be complete
prev.sales <- retail %>%
  filter(Date > '2011-11-30' & Date < '2011-12-09') %>%
  summarise(TotalSales = sum(Sales))
prev.sales <- prev.sales[[1]]
# Projected December total (point estimate, lower, upper): booked sales plus
# the forecast for the remaining days
cbind(sum(daily.pred$fit) + prev.sales, sum(lower) + prev.sales, sum(upper) + prev.sales)
## [,1] [,2] [,3]
## [1,] 1592949 1471916 1713983
# Total sales for December 2010 (last year's full December)
last.dec <- retail %>%
  filter(Date > '2010-11-30' & Date < '2011-01-01') %>%
  summarise(TotalSales = sum(Sales))
last.dec[[1]]
## [1] 1126445
This model, taking the growth trend into account, would predict total December sales to lie between 1,471,916 and 1,713,983 pounds. This figure seems quite high, especially considering last year’s total December sales were only 1,126,445. The model is likely putting too much weight on the sales increase from December 2009 to December 2010.
# December sales up to (and including) the 8th, for this year and last year
this.dec.frac <- retail %>%
  filter(Date > '2011-11-30' & Date < '2011-12-09') %>%
  summarise(TotalSales = sum(Sales))
last.dec.frac <- retail %>%
  filter(Date > '2010-11-30' & Date < '2010-12-09') %>%
  summarise(TotalSales = sum(Sales))
this.dec.frac[[1]] # Total sales this year in the first eight December days
## [1] 401536.5
last.dec.frac[[1]] # Total sales last year in the first eight December days
## [1] 649912.6
# Share of last year's total December sales that fell in the first eight days
frac <- last.dec.frac[[1]] / last.dec[[1]]
frac
## [1] 0.5769588
# Extrapolate that share onto this year's first-eight-days sales
this.dec.frac[[1]] / frac
## [1] 695953.5
Another interesting point is that total sales in the first eight days of December last year were about 650,000 pounds, while this year the figure is only about 401,000 pounds. Given that last year 57% of December sales occurred in these first eight days, this may cast doubt on the accuracy of the models, especially the second one. Looking purely at these numbers, we would expect only about 700,000 pounds of total December sales this year. However, this might just be a coincidence — perhaps Christmas shopping is simply being left a bit later in 2011 than in 2010.
How confident are you of this forecast - do you back your prediction enough to recommend the new Ferrari?
Based on these estimates, I would give a conservative estimate of Sales to be between 700,000 to 900,000 pounds. Depending on the owners profit margin, this could be enough for the Ferrari, but to be on the cautious side maybe a Toyota would be a better choice for this year.
While the linear model is a very useful multi-purpose tool, let’s now see how a few more specialized approaches designed for time series analysis compare instead. We will be looking at exponential smoothing and models from the ARIMA family.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tsibble)
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✓ tsibbledata 0.3.0 ✓ fable 0.3.1
## ✓ feasts 0.2.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag() masks stats::lag()
## x tsibble::setdiff() masks base::setdiff()
## x tsibble::union() masks base::union()
library(zoo)
##
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
##
## index
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# There are some calendar days with no recorded sales, which breaks several
# forecasting methods. Impute them: complete()/fill_gaps() insert the missing
# days, and na.approx() fills their Sales by linear interpolation between the
# surrounding observed days (for a single missing day this equals the average
# of its neighbours).
retail %>%
select(Sales, Date, MonthYear) %>%
separate(MonthYear,into=c('Year','Month')) %>%
select(-c(Year)) %>% group_by(Date) %>%
summarise(Sales=sum(Sales),Month = as.integer(Month)) %>% unique() -> daily.sales
## `summarise()` has grouped output by 'Date'. You can override using the `.groups` argument.
# Expand to one row per calendar day between the first and last observation
daily.sales %>%
mutate(Date = as.Date(Date)) %>%
complete(Date = seq.Date(min(Date), max(Date), by="day")) -> daily.sales
# Convert to a tsibble indexed by Date and interpolate the gaps
daily.sales <- as_tsibble(daily.sales,index=Date) %>% fill_gaps()
daily.sales$Sales <- na.approx(daily.sales$Sales)
daily.sales$Month <- na.approx(daily.sales$Month)
# Time plot of the (imputed) daily sales series. The data is a UK retailer
# priced in pounds sterling, so label the axis in pounds -- the rest of the
# document's plots already say "(in Pounds)".
autoplot(daily.sales, Sales) +
  labs(y = "Daily sales in pounds",
       title = "Daily sales")
Again, as noted before, there do appear to be some seasonal patterns going on but no clear upwards or downwards trend. Let’s see what a model decomposition into trend, seasonal and cyclical components can reveal about the data. We have the choice between using an additive or multiplicative model, depending on the structure of the data. Given that we don’t see a clear trend in the data, mostly seasonal variation, an additive model seems appropriate for now.
# Classical additive decomposition: Sales = trend + seasonal + random
dcmp <- daily.sales %>%
  model(classical_decomposition(Sales, type = "additive"))
components(dcmp)
## # A dable: 739 x 7 [1D]
## # Key: .model [1]
## # : Sales = trend + seasonal + random
## .model Date Sales trend seasonal random season_adjust
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "classical_decomposi… 2009-12-01 54177. NA 5791. NA 48386.
## 2 "classical_decomposi… 2009-12-02 63217. NA 1762. NA 61455.
## 3 "classical_decomposi… 2009-12-03 73959. NA 7575. NA 66384.
## 4 "classical_decomposi… 2009-12-04 40624. 44466. -875. -2966. 41500.
## 5 "classical_decomposi… 2009-12-05 9803. 43504. -6409. -27292. 16212.
## 6 "classical_decomposi… 2009-12-06 24482. 40244. -11524. -4237. 36007.
## 7 "classical_decomposi… 2009-12-07 44999. 35870. 3680. 5448. 41318.
## 8 "classical_decomposi… 2009-12-08 47444. 35556. 5791. 6097. 41653.
## 9 "classical_decomposi… 2009-12-09 40397. 38476. 1762. 158. 38634.
## 10 "classical_decomposi… 2009-12-10 43342. 38131. 7575. -2364. 35767.
## # … with 729 more rows
components(dcmp) %>% autoplot()
The decomposition appears to confirm our initial observation of the absence of any significant multi-year trend. There does, however, seem to be a strong yearly, cyclical pattern.
A good benchmark for time series data to compare more complex models to is the naive model, which simply predicts that future values will be equal to the most recent observation. There is also a seasonal naive variant, which does the same but adjusted for seasonality. Let’s fit these two simple models and see how we can improve on them further. I will use the last 30 days of data as a test set and the remainder as training set:
# Hold out the final 30 days as a test set; train on everything before
last_day <- max(daily.sales$Date)
train <- daily.sales %>% filter_index(~ as.character(last_day - 30))
test <- daily.sales %>% filter_index(as.character(last_day - 29) ~ .)
# Benchmark models: naive (carry the last value forward) and seasonal naive
sales_fit_naive <- train %>% model(`Naïve` = NAIVE(Sales))
sales_fit_snaive <- train %>% model(`Seasonal naïve` = SNAIVE(Sales))
# 30-day-ahead forecasts from both benchmarks
sales_fc_naive <- sales_fit_naive %>% forecast(h = 30)
sales_fc_snaive <- sales_fit_snaive %>% forecast(h = 30)
# Naive forecast plotted over the training data, with the held-out actuals in grey
sales_fc_naive %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Forecasts for final 30 days of sales") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`
# Seasonal-naive forecast, same layout as the naive plot above
sales_fc_snaive %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Forecasts for final 30 days of sales") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`
Not a bad first model. Let’s take a look at the residuals to see if there is more data we can capture in our model:
# Residual diagnostics for the naive model (time plot, ACF, histogram)
daily.sales %>%
  model(NAIVE(Sales)) %>%
  gg_tsresiduals()
# Ljung-Box portmanteau test on the innovation residuals
daily.sales %>%
  model(NAIVE(Sales)) %>%
  augment() %>%
  features(.innov, ljung_box, lag = 730, dof = 0)
## # A tibble: 1 x 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Sales) 2928. 0
Both the acf plot and the ljung_box test indicate that there is in fact significant auto-correlation in the residuals, indicating that we should proceed and try fit a more complex model. Let’s see how they behave for the seasonal naive model.
# Same residual diagnostics for the seasonal-naive model
daily.sales %>%
  model(SNAIVE(Sales)) %>%
  gg_tsresiduals()
# Ljung-Box portmanteau test on the innovation residuals
daily.sales %>%
  model(SNAIVE(Sales)) %>%
  augment() %>%
  features(.innov, ljung_box, lag = 730, dof = 0)
## # A tibble: 1 x 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Sales) 1188. 0
Again, we see significant auto-correlation with the lags, and a highly significant p-value in the ljung box test. Let’s proceed with a few more complex models. Let’s take a look at some performance metrics, in particular RMSE and MAE.
rbind(accuracy(sales_fc_naive, daily.sales),accuracy(sales_fc_snaive, daily.sales))
## # A tibble: 2 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Naïve Test -8320. 20213. 15954. -30.8 39.4 1.35 1.19 0.0885
## 2 Seasonal naïve Test -4025. 14868. 12173. -16.9 26.6 1.03 0.877 0.0826
We can see that the SNaive model has a much lower error than the naive one. Let’s see how a few more complex models compare to this.
# Time-series regression: linear trend plus day-of-week seasonality
sales_lm <- train %>%
  model(TSLM(Sales ~ trend() + season()))
report(sales_lm)
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -57171 -9615 -3251 6736 86395
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33517.289 1845.132 18.165 < 2e-16 ***
## trend() 9.604 2.852 3.368 0.000799 ***
## season()week2 -8812.329 2186.869 -4.030 6.20e-05 ***
## season()week3 -14094.636 2186.875 -6.445 2.15e-10 ***
## season()week4 -18926.444 2186.884 -8.655 < 2e-16 ***
## season()week5 -4703.573 2186.897 -2.151 0.031833 *
## season()week6 -2066.016 2181.505 -0.947 0.343935
## season()week7 -6155.879 2181.513 -2.822 0.004910 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15540 on 701 degrees of freedom
## Multiple R-squared: 0.1507, Adjusted R-squared: 0.1423
## F-statistic: 17.77 on 7 and 701 DF, p-value: < 2.22e-16
# 30-day forecast from the regression model, plotted over the training data
sales_lm_fc <- sales_lm %>% forecast(h = 30)
sales_lm_fc %>%
  autoplot(train) +
  labs(title = "Forecasts of daily sales using regression",
       y = "Sales in Pounds")
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
The linear model appears to be putting too much weight on the average sales value from earlier in the year.
accuracy(sales_lm_fc,daily.sales)
## # A tibble: 1 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSLM(Sales ~ trend… Test 21399. 26136. 21465. 35.3 35.6 1.82 1.54 -0.0412
The linear model’s prediction therefore is quite far off, performing even worse than the naive model.
Let’s take a look at how a model from the exponential smoothing (ETS) family of models performs instead. With Holt-Winters models, we have the choice of selecting either an additive or multiplicative model. The right choice here depends on whether there exists a (exponentially increasing) trend or the data looks rather flat. In this case, either model could work as the graph looks mostly flat, but there appears to be a slight trend in the later half of the year. Let’s try fit both models, as well as an automatically determined model.
# Holt-Winters variants: fully additive, multiplicative error/season, and an
# automatically selected ETS specification for comparison
hw_fit <- train %>%
model(
additive = ETS(Sales ~ error("A") + trend("A") + season("A")),
multiplicative = ETS(Sales ~ error("M") + trend("A") + season("M")),
auto = ETS(Sales)
)
report(hw_fit)
## # A tibble: 3 x 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 additive 1.49e+8 -8993. 18009. 18010. 18064. 1.47e8 1.63e8 8.38e+3
## 2 multiplica… 1.78e-1 -8922. 17867. 17867. 17922. 1.45e8 1.59e8 3.06e-1
## 3 auto 1.48e+8 -8991. 18002. 18002. 18048. 1.46e8 1.61e8 8.39e+3
All three information criteria (AIC,AICc and BIC) seem pretty close to each other for all three models. Let’s see how their forecasts compare.
# 30-day Holt-Winters forecasts, with the held-out actuals in grey
fc_hw <- hw_fit %>% forecast(h = 30)
fc_hw %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Holt-Winters forecasts for final 30 days of sales") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`
Both models appear to be quite similar to the seasonal naive approach, without any strong trend or cyclical component. This perhaps makes sense since we were not able to identify any significant trends in the training data.
accuracy(fc_hw, daily.sales)
## # A tibble: 3 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 additive Test -4283. 15979. 12118. -18.9 27.8 1.03 0.943 -0.0107
## 2 auto Test -2705. 15508. 11484. -15.5 25.7 0.975 0.915 -0.0218
## 3 multiplicative Test -3214. 15055. 10363. -11.7 20.1 0.880 0.888 -0.192
All three models are a clear improvement over the naive benchmarks, which is a great start. Contrary to our initial intuition, the multiplicative model actually performs best on the test set with an RMSE of 15,055, ahead of the automatically selected model (RMSE 15,508) and the additive model (RMSE 15,979).
Let’s see how another popular group, the ARIMA family of models, performs on this dataset. ARIMA is another very popular approach for time series forecasting. The idea here is, as opposed to exponential smoothing where the predictions are based on trend and seasonality of the data, to make predictions based on autocorrelation of future observations with the past.
The ARIMA class of models requires time series to be made stationary in order to make useful predictions. Stationarity refers to the series not containing any trend or seasonality, and rather just have random white noise or unpredictable cycles. Variance should also be roughly constant, if it is not a transformation such as taking the logarithm of the series or a Box-Cox transformation can be used.
In many time series, this is not the case. In these situations, a technique called differencing can be used to make a series stationary. There are two common forms of differencing: taking the first order difference, which simply refers to subtracting the previous observation from every observation. This makes the time series essentially a series of changes from the last observation. In the example of stock prices, this means instead of a time series of stock prices it simply becomes a time series of daily stock price changes, which in a lot of cases tend to be stable. The second common form is taking the seasonal difference, if there is a strong seasonal pattern. If the pattern is quarterly, for example, this means subtracting the most recent quarter from each quarterly observation, making the time series essentially a series of quarterly changes.
Checking for stationarity can be done by inspecting the time series plot, as well as looking at ACF and PACF plots. Having seen the time plot above, it appears that differencing might be a good idea. Let’s take a look at the first order difference:
daily.sales %>% gg_tsdisplay(difference(Sales) , plot_type='partial')
It appears that there is a weekly pattern that the data is autocorrelated with. Let’s also take the weekly difference now:
# Seasonal (lag-7) difference followed by a first difference, with ACF/PACF panels
daily.sales %>%
  gg_tsdisplay(difference(Sales, 7) %>% difference(), plot_type = 'partial') +
  labs(title = "Double differenced", y = "")
Looking at the plot, a 1st order MA model with a seasonal MA(1) might be appropriate. Let’s see how this works. I will try fit a ARIMA(0,1,2)(0,1,1) model first, and also see how an automatically fitted model performs as well.
# Candidate ARIMA specifications (orders suggested by the (P)ACF plots above)
# plus an automatically selected model
fit <- train %>%
model(
arima012011 = ARIMA(Sales ~ pdq(0,1,2) + PDQ(0,1,1)),
arima210011 = ARIMA(Sales ~ pdq(2,1,0) + PDQ(0,1,1)),
arima312011 = ARIMA(Sales ~ pdq(3,1,2) + PDQ(0,1,1)),
auto = ARIMA(Sales, stepwise = FALSE, approx = FALSE) #By setting stepwise and approx to false we make the algorithm work a little harder to find the right model
)
# 30-day-ahead forecasts from all four candidates
fc_arima <- fit %>% forecast(h=30)
# ARIMA forecasts plotted over the training data, held-out actuals in grey
fc_arima %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Forecasts for final 30 days of sales (ARIMA forecast)") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`
accuracy(fc_arima, daily.sales)
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima012011 Test -3351. 15393. 11228. -16.1 25.0 0.953 0.908 -0.0433
## 2 arima210011 Test -6913. 16608. 13163. -23.4 30.2 1.12 0.980 -0.0392
## 3 arima312011 Test 2608. 14856. 9886. -3.26 19.0 0.839 0.877 -0.0904
## 4 auto Test 4378. 17792. 13323. -3.10 26.9 1.13 1.05 0.132
The arima312011 model has a RMSE of 14856 on the test data. Interestingly, the automatically selected model performs the worst. Letting the computer do all the thinking sometimes can be dangerous it appears.
I will introduce one more model into this analysis, which is based on fitting a neural network on the lags of the data to predict future values. I will be using the NNETAR() function for this:
# Neural-net autoregressions: one with 10 ordinary and 2 seasonal lags, and
# one with automatically chosen lag orders
nn_fit <- train %>%
  model(
    nn102 = NNETAR(Sales, p = 10, P = 2),
    nnauto = NNETAR(Sales)
  )
fc_nn <- nn_fit %>% forecast(h = 30)
# Neural-net forecasts plotted over the training data, held-out actuals in grey
fc_nn %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Forecasts for final 30 days of sales (Neural Net forecast)") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`
Visually it appears as if both models quite nicely capture some of the seasonal spikes in the data. However, they appear to be falling down a bit too fast again.
accuracy(fc_nn, daily.sales)
## # A tibble: 2 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 nn102 Test 6280. 22974. 18879. 3.19 34.8 1.60 1.36 0.499
## 2 nnauto Test 3170. 19369. 16101. -2.09 29.7 1.37 1.14 0.338
The accuracy of the Neural network unfortunately isn’t quite as high as the graph might have suggested. I will be sticking to the exponential smoothing model for my final forecast.
# Fit the automatically selected ETS model on (almost) the full series.
# NOTE(review): 1:738 hard-codes "all rows but the last (incomplete) day";
# head(daily.sales, -1) would survive changes in the data length.
final_fit <- daily.sales[1:738,] %>% model(ETS(Sales))
report(final_fit)
## Series: Sales
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.277415
## gamma = 0.0001002125
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 43290.35 3675.734 -9856.863 -6413.32 -880.5316 7569.498 1757.187 4148.296
##
## sigma^2: 152421520
##
## AIC AICc BIC
## 18790.17 18790.47 18836.21
# Forecast the 23 remaining days of December 2011 and plot over the full history
fc_ets <- final_fit %>% forecast(h = 23)
fc_ets %>%
  autoplot(daily.sales, level = NULL) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Final forecast for December 2011 sales (ETS)") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
Let’s take a look back at our initial linear model and see how this compares:
Let’s come back to our linear model that I fitted last time and see how this compares to these specialised time series approaches. I will refit the model using the same training/ test split that I have used for the other two sets to ensure comparability.
# Collect the earlier lm() daily forecasts alongside their calendar dates
pred_df <- data.frame(date, daily.pred$fit) %>%
  mutate(date = as.Date(date))
pred_df
## date daily.pred.fit
## 1 2011-12-09 51730.72
## 2 2011-12-10 51737.07
## 3 2011-12-11 51743.42
## 4 2011-12-12 51749.76
## 5 2011-12-13 51756.11
## 6 2011-12-14 51762.46
## 7 2011-12-15 51768.81
## 8 2011-12-16 51775.16
## 9 2011-12-17 51781.51
## 10 2011-12-18 51787.86
## 11 2011-12-19 51794.21
## 12 2011-12-20 51800.56
## 13 2011-12-21 51806.91
## 14 2011-12-22 51813.26
## 15 2011-12-23 51819.61
## 16 2011-12-24 51825.95
## 17 2011-12-25 51832.30
## 18 2011-12-26 51838.65
## 19 2011-12-27 51845.00
## 20 2011-12-28 51851.35
## 21 2011-12-29 51857.70
## 22 2011-12-30 51864.05
## 23 2011-12-31 51870.40
# Overlay the linear-model forecast (blue) on the observed daily series.
# The data is priced in pounds sterling, so label the axis in pounds -- the
# other plots in this document already say "(in Pounds)".
autoplot(daily.sales, Sales) +
  labs(y = "Daily sales in pounds",
       title = "Daily sales") +
  autolayer(
    as_tsibble(pred_df),
    colour = "blue"
  )
## Using `date` as index variable.
## Plot variable not specified, automatically selected `.vars = daily.pred.fit`
Visually, the linear model looks very similar to the naive model which simply takes the last observation as its forecasted value.
# #Split into training and test set
# NOTE(review): 1:708 / 709:738 hard-code a 30-day holdout at the end of the
# 738-row imputed series -- confirm if the data length changes.
train <- daily.sales[1:708,]
test <- daily.sales[709:738,]
# Regressors for the holdout window (response dropped)
test %>% select(-c(Sales)) -> newdata
# Refit the linear model (Date trend + Month) on the same split as the other models
fit <- train %>%
model(
lm = TSLM(Sales ~ Month + Date)
)
newdata
## # A tsibble: 30 x 2 [1D]
## # Groups: @ Date [30]
## Date Month
## <date> <dbl>
## 1 2011-11-09 11
## 2 2011-11-10 11
## 3 2011-11-11 11
## 4 2011-11-12 11
## 5 2011-11-13 11
## 6 2011-11-14 11
## 7 2011-11-15 11
## 8 2011-11-16 11
## 9 2011-11-17 11
## 10 2011-11-18 11
## # … with 20 more rows
# Forecast over the held-out window and compare against the actuals
fc <- forecast(fit, new_data = newdata)
train %>%
  autoplot(Sales) +
  autolayer(fc) +
  labs(title = "Daily Sales", y = "Sales (in pounds)")
# Out-of-sample accuracy of the refitted linear model
accuracy(fc, daily.sales)
## # A tibble: 1 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lm Test 15338. 23674. 18718. 18.6 32.1 1.59 1.40 0.171
Our RMSE goodness-of-fit statistic confirms this visual impression: the exponential smoothing approach, purpose-built for time series forecasting, cuts the linear model’s RMSE by roughly a third (about 15,500 versus 23,674).
# Sales already booked for December 2011 (the 1st to the 8th); the last
# recorded day (2011-12-09) is excluded as it does not appear to be complete.
prev.sales <- retail %>%
  filter(Date > '2011-11-30' & Date < '2011-12-09') %>%
  summarise(TotalSales = sum(Sales))
prev.sales <- prev.sales[[1]]
# Projected December total = booked sales + the 23-day ETS forecast
print("Total projected sales for December 2011 using ETS:")
sum(fc_ets$.mean) + prev.sales
## [1] "Total projected sales for December 2011 using ETS:"
## [1] 1793260
Using the exponential smoothing forecast model we come to a total projected sales figure of 1,793,260 pounds.